###############################################                                     
# Cause or Effect? Turnout in Hispanic Majority-Minority Districts
#  - John A. Henderson, Jasjeet S. Sekhon, and Rocio Titiunik
#  - Forthcoming in Political Analysis
#  - Replication file for <Table XII>
#  - April 14, 2016
###############################################

############################################################################
#
## GENERATES TABLE XII: Balance RESULTS FOR 2000 Open Seats
#
###########################################################################
        
# Session setup: widen printed output and start from an empty workspace.
options(width = 150)
rm(list = ls())

# Root directory of the replication archive. headers.R is sourced from it;
# presumably it loads the packages/helpers used below (MatchBalance is not
# defined in this file) -- confirm against headers.R.
path <- '/local/'
source(paste0(path, 'replicationPA/funs/headers.R'))

# Fixed seed so that the random control draws made later in this script
# (sampling with replacement) are reproducible.
set.seed(57459)

# LOAD BASELINE AND MATCHED DATA

# Earlier locations of the two inputs (kept for reference):
#~/ethnicity/open_seats/output/RData.WH-data-open_seats
#~/ethnicity/open_seats/output/final_runs/Matched-open_seats-trimmed.csv

#load(paste(path,"RData.WH-data-open_seats",sep='')); WH2data <- data
# Baseline data: the .Rdata file must define an object called `WHdata`,
# which is copied into WH2data for the rest of the script.
load(paste0(path, 'replicationPA/data/open_data/OpenBaseline-aftermatching.Rdata'))
WH2data <- WHdata

# Matched data: read from CSV, keeping strings as character columns.
Sdata <- read.csv(
  file = paste0(path, "replicationPA/data/open_data/Matched-open_seats-trimmed.csv"),
  header = TRUE,
  stringsAsFactors = FALSE
)
                                                                    
# cheap matching; roughly get at bias
# Sort the baseline data by triplet id so that row positions are grouped by
# DisTri_1991 before any index is taken.
WH2data <- WH2data[order(WH2data$DisTri_1991), ]

# indtr: one row per treated unit -- column 1 is its row position in WH2data,
# column 2 its triplet id -- ordered by triplet id.
treated_rows <- which(WH2data$Tr == TRUE)
indtr <- matrix(NA, length(treated_rows), 2)
indtr[, 1] <- treated_rows
indtr[, 2] <- WH2data$DisTri_1991[treated_rows]
# drop = FALSE keeps indtr a matrix even with a single treated row; otherwise
# it collapses to a vector, nrow() returns NULL and matrix() below fails.
indtr <- indtr[order(indtr[, 2]), , drop = FALSE]

# indct: empty placeholder with the same shape as indtr (only referenced by
# commented-out code further down).
indct <- matrix(NA, nrow(indtr), 2)

                        

# Build a crude matched baseline sample, one triplet (DisTri_1991) at a time:
# for every triplet that has at least one control row, keep all of its treated
# rows and draw, with replacement, as many control rows as it has treated rows.
# Triplets without any control row contribute nothing (their treated rows are
# dropped as well).
distrip <- WH2data$DisTri_1991
und <- sort(unique(distrip))

# Per-triplet results are stored in preallocated lists and flattened once,
# instead of growing vectors with c() inside the loop.
ctrl_draws <- vector("list", length(und))
trt_rows <- vector("list", length(und))
for (i in seq_along(und)) {
  ind <- which(distrip == und[i] & WH2data$Tr == FALSE)   # control rows
  inda <- which(distrip == und[i] & WH2data$Tr == TRUE)   # treated rows
  if (length(ind) > 0) {
    len <- length(inda)
    # Draw positions within `ind` rather than calling sample(ind, ...):
    # when `ind` holds a single row number k, sample(k, ...) would sample from
    # 1:k and return rows that are not controls of this triplet. With two or
    # more candidates this is the same computation sample() performs, so the
    # draws under the fixed seed are unchanged.
    picks <- ind[sample.int(length(ind), size = len, replace = TRUE)]
    ctrl_draws[[i]] <- sort(picks)
    trt_rows[[i]] <- inda
  }
}
sps <- unlist(ctrl_draws)   # sampled control rows, triplet by triplet
spa <- unlist(trt_rows)     # treated rows of the triplets that had controls

#indtr[,1]=spa
#indct[,1]=sps
#indct[,2]=distrip[sps]

# Keep treated rows first, then the sampled controls (controls may repeat).
indt <- c(spa, sps)
WH2data <- WH2data[indt, ]

# Aggregate columns for the baseline data: three broader income brackets and
# the total foreign share, each the sum of two finer columns. These aggregates
# are later read from both WH2data and Sdata but are only computed here for
# WH2data.
WH2data$phh_income99_39less <- WH2data$phh_income99_0to19 + WH2data$phh_income99_20to39
WH2data$phh_income99_40to74 <- WH2data$phh_income99_40to59 + WH2data$phh_income99_60to74
WH2data$phh_income99_100plus <- WH2data$phh_income99_100to199 + WH2data$phh_income99_200
WH2data$ppop_foreign <- WH2data$ppop_foreign_naturcit + WH2data$ppop_foreign_nocit

# 1998 non-Hispanic registration: count (total minus Hispanic) and its ratio
# to total registration, for the baseline data ...
WH2data$y98_reg_nhisp <- WH2data$y98_reg_tot - WH2data$y98_reg_hisp
WH2data$y98_preg_nhisp <- WH2data$y98_reg_nhisp / WH2data$y98_reg_tot

# ... and for the matched data.
Sdata$y98_reg_nhisp <- Sdata$y98_reg_tot - Sdata$y98_reg_hisp
Sdata$y98_preg_nhisp <- Sdata$y98_reg_nhisp / Sdata$y98_reg_tot

# balance covariates

# Conditioning/Balance set: group most important variables first
# HERE
# Matched-data (Sdata) version of the main covariate block. The arguments are
# unnamed, so data.frame() names each column after its expression
# (Sdata$vap becomes "Sdata.vap"); do not rewrite the expressions without
# checking the resulting column names.
# NOTE(review): phh_income99_39less, phh_income99_40to74, phh_income99_100plus
# and ppop_foreign are computed in this script for WH2data only; presumably the
# matched CSV already contains them -- confirm.
imp.vars <- data.frame(
Sdata$vap,
Sdata$ppop_black18,
Sdata$ppop_hispanic18,
Sdata$phh_income99_39less,
Sdata$phh_income99_40to74,
Sdata$phh_income99_75to99,
Sdata$phh_income99_100plus,
Sdata$ppop_25_hsless,
Sdata$ppop_foreign,
Sdata$ppop_foreign_naturcit,
Sdata$ppop_foreign_nocit
)

# Registration variables for the matched data: five 1998 (y98_*) columns
# followed by three 2000 (y00_*) columns.
reg.vars <- data.frame(
Sdata$y98_preg_tot,
Sdata$y98_preg_hisp, # no turnout info for 1998, so this is the closest we have to previous outcome
Sdata$y98_preg_nhisp, # derived earlier in this script: (reg_tot - reg_hisp) / reg_tot
Sdata$y98_preg_dem,
Sdata$y98_preg_rep,
Sdata$y00_preg_tot,
Sdata$y00_preg_dem,
Sdata$y00_preg_rep
)
# Report the size of the block (rows, columns).
dim(reg.vars)
cat("Final dimension of reg.vars: ", dim(reg.vars), "\n")

# Vote variables for the matched data: five 1998 (y98_*) columns followed by
# three 2000 (y00_*) columns.

vote.vars <- data.frame(
Sdata$y98_pvote_ussdem,        # statewide and local offices
Sdata$y98_pvote_govdem,
Sdata$y98_pvote_cngdem,
Sdata$y98_pvote_assdem,
Sdata$y98_pvote_atgdem,
Sdata$y00_pvote_ussdem,        # statewide and local offices
Sdata$y00_pvote_cngdem,
Sdata$y00_vote_pprsdem         # NOTE(review): prefix is y00_vote_, not y00_pvote_ like the others -- confirm intended column
)

# Report the size of the block (rows, columns).
dim(vote.vars)
cat("Final dimension of vote.vars: ", dim(vote.vars), "\n")

# factorize triplets for balance tests
# One 0/1 indicator column per distinct DisTri_1991 value in Sdata: entry
# [r, j] is 1 when row r belongs to the j-th distinct triplet, 0 otherwise.
# The result is only referenced by commented-out code in the population block.
un_trip <- unique(Sdata$DisTri_1991)
triplet <- matrix(0, nrow(Sdata), length(un_trip))
for (j in seq_along(un_trip)) {
  # which() drops NA comparisons; an empty match leaves the column all zero.
  triplet[which(Sdata$DisTri_1991 == un_trip[j]), j] <- 1
}
triplet <- as.data.frame(triplet)
# Label the columns with the triplet ids. The original chained assignment
# (colnames = names(triplet) = ...) also created a global variable called
# `colnames`, shadowing the name of the base function; that is dropped here.
names(triplet) <- as.character(un_trip)

# Population variables for the matched data. The triplet factor / dummy
# columns are deliberately left out (commented lines inside the call).
pop.vars <- data.frame(
Sdata$ppop_fem,
Sdata$ppop_25to44,
Sdata$ppop_45to59,
Sdata$ppop_70older
#as.factor(Sdata$DisTri_1991)
#triplet
)
cat("Final dimension of pop.vars: ", dim(pop.vars), "\n")


# Full matched-data covariate set: the four blocks side by side, in the order
# imp, reg, vote, pop. The same order is used for the WH2data set built next,
# and the balance results are later matched between the two by position.
SXall <- data.frame (imp.vars,reg.vars, vote.vars, pop.vars)  # exclude Sdata$DisTri_1991 since we will keep one unique triplet in every file
#X <- data.frame (imp.vars)  # exclude Sdata$DisTri_1991 since we will keep one unique triplet in every file
#B <- X
dim(SXall)
   
# Conditioning/Balance set: group most important variables first
# WHdata 
        
# factorize triplets for balance tests
# One 0/1 indicator column per distinct DisTri_1991 value in WH2data: entry
# [r, j] is 1 when row r belongs to the j-th distinct triplet, 0 otherwise.
# The result is only referenced by commented-out code in the population block.
un_trip <- unique(WH2data$DisTri_1991)
triplet <- matrix(0, nrow(WH2data), length(un_trip))
for (j in seq_along(un_trip)) {
  # which() drops NA comparisons; an empty match leaves the column all zero.
  triplet[which(WH2data$DisTri_1991 == un_trip[j]), j] <- 1
}
triplet <- as.data.frame(triplet)
# Label the columns with the triplet ids. The original chained assignment
# (colnames = names(triplet) = ...) also created a global variable called
# `colnames`, shadowing the name of the base function; that is dropped here.
names(triplet) <- as.character(un_trip)


# Baseline-data (WH2data) version of the main covariate block. This reuses the
# name imp.vars, replacing the Sdata version, which has already been copied
# into SXall. The arguments are unnamed, so data.frame() names each column
# after its expression (WH2data$vap becomes "WH2data.vap"); these names later
# become the row labels of the results matrices.
imp.vars <- data.frame(
WH2data$vap,
WH2data$ppop_black18,
WH2data$ppop_hispanic18,
WH2data$phh_income99_39less,
WH2data$phh_income99_40to74,
WH2data$phh_income99_75to99,
WH2data$phh_income99_100plus,
WH2data$ppop_25_hsless,
WH2data$ppop_foreign,
WH2data$ppop_foreign_naturcit,
WH2data$ppop_foreign_nocit
)

# Registration variables for the baseline data: five 1998 (y98_*) columns
# followed by three 2000 (y00_*) columns, in the same order as the Sdata block.
reg.vars <- data.frame(
WH2data$y98_preg_tot,
WH2data$y98_preg_hisp, # no turnout info for 1998, so this is the closest we have to previous outcome
WH2data$y98_preg_nhisp, # derived earlier in this script: (reg_tot - reg_hisp) / reg_tot
WH2data$y98_preg_dem,
WH2data$y98_preg_rep,
WH2data$y00_preg_tot,
WH2data$y00_preg_dem,
WH2data$y00_preg_rep
)
# Report the size of the block (rows, columns).
dim(reg.vars)
cat("Final dimension of reg.vars: ", dim(reg.vars), "\n")

# Vote variables for the baseline data: five 1998 (y98_*) columns followed by
# three 2000 (y00_*) columns, in the same order as the Sdata block.

vote.vars <- data.frame(
WH2data$y98_pvote_ussdem,        # statewide and local offices
WH2data$y98_pvote_govdem,
WH2data$y98_pvote_cngdem,
WH2data$y98_pvote_assdem,
WH2data$y98_pvote_atgdem,
WH2data$y00_pvote_ussdem,        # statewide and local offices
WH2data$y00_pvote_cngdem,
WH2data$y00_vote_pprsdem         # NOTE(review): prefix is y00_vote_, not y00_pvote_ like the others -- confirm intended column
)

# Report the size of the block (rows, columns).
dim(vote.vars)
cat("Final dimension of vote.vars: ", dim(vote.vars), "\n")

# Population variables for the baseline data. The triplet factor / dummy
# columns are deliberately left out (commented lines inside the call).
pop.vars <- data.frame(
WH2data$ppop_fem,
WH2data$ppop_25to44,
WH2data$ppop_45to59,
WH2data$ppop_70older
#as.factor(WH2data$DisTri_1991)
#triplet
)
cat("Final dimension of pop.vars: ", dim(pop.vars), "\n")

# Full baseline covariate set: the four blocks side by side, in the order
# imp, reg, vote, pop (same order as SXall). colnames(WXall) is later used as
# the row labels of both results matrices.
WXall <- data.frame (imp.vars,reg.vars, vote.vars, pop.vars)  # exclude WH2data$DisTri_1991 since we will keep one unique triplet in every file
#X <- data.frame (imp.vars)  # exclude WH2data$DisTri_1991 since we will keep one unique triplet in every file
#B <- X
dim(WXall)

  

#####################################################################################
## Generate balance matrices for WH1data and Wh2data
#####################################################################################
#source("generate-balance-matrices.R")

#####################################################################################
## Generate turnout and registration rates using different denominators for Sdata, WH2data and WH1data
#####################################################################################
#source("generate-turnout-measures.R")

######################################################

# Run GenMatchStack to get matched dataset 

######################################################
  
######################################################
### Results matrices
######################################################
# Row labels: one per covariate, taken from the baseline covariate frame.
BALrownm <- colnames(WXall)

# Column labels: treated mean, control mean, difference-in-means p-value and
# KS p-value, once for the WH2data sample and once for the matched sample.
BALcolnm1 <- c(
  "Mean Tr WH2-mat",
  "Mean Co WH2-mat",
  "Diff means p-val WH2-mat",
  "KS test p-val WH2-mat"
)
BALcolnm2 <- c(
  "Mean Tr AM",
  "Mean Co AM",
  "Diff means p-val AM",
  "KS test p-val AM"
)

# Empty (all-NA) result tables, filled in row by row by the balance tests.
RESBAL1 <- matrix(
  data = NA,
  nrow = length(BALrownm),
  ncol = length(BALcolnm1),
  dimnames = list(BALrownm, BALcolnm1)
)
RESBAL2 <- matrix(
  data = NA,
  nrow = length(BALrownm),
  ncol = length(BALcolnm2),
  dimnames = list(BALrownm, BALcolnm2)
)


######################
# Sample sizes
#####################
# Size and treated/control counts of the (resampled) baseline data. These are
# bare top-level expressions, so they are shown only where top-level values
# are auto-printed.
dim(WH2data)
table(WH2data$Tr)

# Same summary for the matched data.
dim(Sdata)
table(Sdata$Tr)


######################################################
# load indices to fix measurement bias problem
######################################################

#load('~/share/ethnicity/California/final-runs/qqmatch_index-PPOP18_Final.Rdata')

######################################################
### Balance tests
######################################################

# Passed as the `nboots` argument of both MatchBalance() calls below.
nboots = 0
#$ks$ks$p.value
#$ks$ks.boot.pvalue

########################
## before matching: 
#######################

# Balance statistics on the resampled baseline sample (WH2data) -> RESBAL1.
Tr <- WH2data$Tr
m1 <- list(
  index.treated = which(Tr == TRUE),
  index.control = which(Tr == FALSE)
)

# Stack treated rows first, then control rows (rows with NA treatment drop out).
TrWH2 <- Tr[c(m1$index.treated, m1$index.control)]
XWH2all_base <- WXall[c(m1$index.treated, m1$index.control), ]

# No match.out is supplied, so only the $BeforeMatching statistics are used.
mb <- MatchBalance(TrWH2 ~ as.matrix(XWH2all_base), ks=TRUE, nboots=nboots)

# Copy the per-covariate statistics into RESBAL1; the i-th entry of
# mb$BeforeMatching is taken to belong to the i-th column of WXall.
for (i in seq_along(BALrownm)) {
  varnm <- BALrownm[i]
  bal <- mb$BeforeMatching[[i]]
  # The KS component is absent when the test is not computed for a covariate
  # (MatchBalance skips it for dichotomous variables); store NA instead of
  # letting the shortened vector abort the assignment.
  ks_p <- bal$ks$ks$p.value
  if (is.null(ks_p)) {
    ks_p <- NA
  }
  RESBAL1[varnm, c("Mean Tr WH2-mat", "Mean Co WH2-mat", "Diff means p-val WH2-mat", "KS test p-val WH2-mat")] <-
    c(bal$mean.Tr, bal$mean.Co, bal$p.value, ks_p)
}
    

########################
## after matching: 
#######################


# Balance statistics on the matched sample (Sdata) -> RESBAL2.
Tr <- Sdata$Tr
m3 <- list(
  index.treated = which(Tr == TRUE),
  index.control = which(Tr == FALSE)
)

# Stack treated rows first, then control rows (rows with NA treatment drop out).
TrS <- Tr[c(m3$index.treated, m3$index.control)]
Xall_base <- SXall[c(m3$index.treated, m3$index.control), ]

# RESBAL2 rows are labelled with the WXall column names but filled by
# position from SXall; a different number of columns would silently attach
# statistics to the wrong labels, so fail loudly instead.
stopifnot(ncol(Xall_base) == length(BALrownm))

# No match.out is supplied: Sdata already is the matched sample, so the
# statistics of interest are the ones MatchBalance reports in $BeforeMatching.
mb <- MatchBalance(TrS ~ as.matrix(Xall_base), ks=TRUE, nboots=nboots)

for (i in seq_along(BALrownm)) {
  varnm <- BALrownm[i]
  bal <- mb$BeforeMatching[[i]]
  # The KS component is absent when the test is not computed for a covariate
  # (MatchBalance skips it for dichotomous variables); store NA instead of
  # letting the shortened vector abort the assignment.
  ks_p <- bal$ks$ks$p.value
  if (is.null(ks_p)) {
    ks_p <- NA
  }
  RESBAL2[varnm, c("Mean Tr AM", "Mean Co AM", "Diff means p-val AM", "KS test p-val AM")] <-
    c(bal$mean.Tr, bal$mean.Co, bal$p.value, ks_p)
}


######################################################
### Print and convert to latex
######################################################
# Print both balance tables: RESBAL1 (WH2data sample) first, then RESBAL2
# (matched sample).
invisible(lapply(list(RESBAL1, RESBAL2), print))

# LaTeX export of the two tables (disabled):
#print(xtable(round(RESBAL1,digits=4)), type="latex", file="tables-WH-RESBAL1-HVAP-PPOP18.tex", append=FALSE, caption.placement="top",
#      latex.environments=c("center"),tabular.environment = "tabular", size=NULL, NA.string = "", include.rownames=TRUE,include.colnames=TRUE)
#print(xtable(round(RESBAL2,digits=4)), type="latex", file="tables-WH-RESBAL2-HVAP-PPOP18.tex", append=FALSE, caption.placement="top",
#      latex.environments=c("center"),tabular.environment = "tabular", size=NULL, NA.string = "", include.rownames=TRUE,include.colnames=TRUE)
           
#print(RESBAL3)
#print(xtable(round(RESBAL3,digits=4)), type="latex", file="tables-WH-RESBAL3-HVAP-PPOP18.tex", append=FALSE, caption.placement="top",
#      latex.environments=c("center"),tabular.environment = "tabular", size=NULL, NA.string = "", include.rownames=TRUE,include.colnames=TRUE)

#print(RESBAL4)
#print(xtable(round(RESBAL4,digits=4)), type="latex", file="tables-WH-RESBAL4-HVAP-PPOP18.tex", append=FALSE, caption.placement="top",
#      latex.environments=c("center"),tabular.environment = "tabular", size=NULL, NA.string = "", include.rownames=TRUE,include.colnames=TRUE)

#print(RESBAL5)
#print(xtable(round(RESBAL5,digits=4)), type="latex", file="tables-WH-RESBAL5-HVAP-PPOP18.tex", append=FALSE, caption.placement="top",
#      latex.environments=c("center"),tabular.environment = "tabular", size=NULL, NA.string = "", include.rownames=TRUE,include.colnames=TRUE)
                  

####### END HERE